In this R notebook we are going to explore the data analytics and data visualization power of R.

In this example we are going to analyze the heart disease dataset from the UCI Machine Learning Repository.

The full UCI database contains 76 attributes, but this CSV uses the commonly studied subset of 13 predictors (features) plus a target, for 303 observations. Heart disease status is binary coded: presence as 1 and absence as 0. The prerequisite for running this R Markdown is to download the CSV data file into your working directory. This can be done by setting the current working directory as follows in an R chunk: setwd("C:\\Users\\RajuPC\\Documents\\MyR")

First load the supporting R libraries

setwd("C:\\Users\\RajuPC\\Documents\\MyR") # Set the working directory (NOTE(review): machine-specific path — adjust for your environment)
library(tidyverse) # Data manipulation and visualization (ggplot2, dplyr, readr, ...)
library(caret) # Unified interface for training and evaluating machine-learning models
library(plotly) # Interactive graphics; ggplotly() converts ggplot objects
library(ggsci) # Collection of color palettes/themes for ggplot

Loading of UCI heart disease data.

# Read the UCI heart-disease CSV from the current working directory
hci <- read_csv("heart.csv")
## Parsed with column specification:
## cols(
##   age = col_integer(),
##   sex = col_integer(),
##   cp = col_integer(),
##   trestbps = col_integer(),
##   chol = col_integer(),
##   fbs = col_integer(),
##   restecg = col_integer(),
##   thalach = col_integer(),
##   exang = col_integer(),
##   oldpeak = col_double(),
##   slope = col_integer(),
##   ca = col_integer(),
##   thal = col_integer(),
##   target = col_integer()
## )
# Recode the 0/1 sex indicator into descriptive labels for plotting
hci$sex <- as.character(hci$sex)
hci$sex[hci$sex == "1"] <- "Male"
hci$sex[hci$sex == "0"] <- "Female"

# Five-number summaries plus mean for every column
summary(hci)
##       age            sex                  cp           trestbps    
##  Min.   :29.00   Length:303         Min.   :0.000   Min.   : 94.0  
##  1st Qu.:47.50   Class :character   1st Qu.:0.000   1st Qu.:120.0  
##  Median :55.00   Mode  :character   Median :1.000   Median :130.0  
##  Mean   :54.37                      Mean   :0.967   Mean   :131.6  
##  3rd Qu.:61.00                      3rd Qu.:2.000   3rd Qu.:140.0  
##  Max.   :77.00                      Max.   :3.000   Max.   :200.0  
##       chol            fbs            restecg          thalach     
##  Min.   :126.0   Min.   :0.0000   Min.   :0.0000   Min.   : 71.0  
##  1st Qu.:211.0   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:133.5  
##  Median :240.0   Median :0.0000   Median :1.0000   Median :153.0  
##  Mean   :246.3   Mean   :0.1485   Mean   :0.5281   Mean   :149.6  
##  3rd Qu.:274.5   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:166.0  
##  Max.   :564.0   Max.   :1.0000   Max.   :2.0000   Max.   :202.0  
##      exang           oldpeak         slope             ca        
##  Min.   :0.0000   Min.   :0.00   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.80   Median :1.000   Median :0.0000  
##  Mean   :0.3267   Mean   :1.04   Mean   :1.399   Mean   :0.7294  
##  3rd Qu.:1.0000   3rd Qu.:1.60   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :6.20   Max.   :2.000   Max.   :4.0000  
##       thal           target      
##  Min.   :0.000   Min.   :0.0000  
##  1st Qu.:2.000   1st Qu.:0.0000  
##  Median :2.000   Median :1.0000  
##  Mean   :2.314   Mean   :0.5446  
##  3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :3.000   Max.   :1.0000
tbl_df(hci)# A nicer view of the data as a table 
## # A tibble: 303 x 14
##      age sex      cp trestbps  chol   fbs restecg thalach exang oldpeak
##    <int> <chr> <int>    <int> <int> <int>   <int>   <int> <int>   <dbl>
##  1    63 Male      3      145   233     1       0     150     0     2.3
##  2    37 Male      2      130   250     0       1     187     0     3.5
##  3    41 Fema~     1      130   204     0       0     172     0     1.4
##  4    56 Male      1      120   236     0       1     178     0     0.8
##  5    57 Fema~     0      120   354     0       1     163     1     0.6
##  6    57 Male      0      140   192     0       1     148     0     0.4
##  7    56 Fema~     1      140   294     0       0     153     0     1.3
##  8    44 Male      1      120   263     0       1     173     0     0  
##  9    52 Male      2      172   199     1       1     162     0     0.5
## 10    57 Male      2      150   168     0       1     174     0     1.6
## # ... with 293 more rows, and 4 more variables: slope <int>, ca <int>,
## #   thal <int>, target <int>

Convert following predictors as factor for plotting

# Convert the categorical predictors to factors so ggplot treats them as discrete
factor_cols <- c("sex", "cp", "thal", "ca")
hci[factor_cols] <- lapply(hci[factor_cols], as.factor)

Distribution of the male and female population across age

# Age distribution split by gender, rendered interactively with a horizontal legend
p1 <- hci %>%
  ggplot(aes(x = age, fill = sex)) +
  geom_bar() +
  xlab("Age") +
  ylab("Number") +
  guides(fill = guide_legend(title = "Gender"))
ggplotly(p1) %>%
  layout(legend = list(orientation = "h", x = 0, y = 1))

Representation of cholesterol level across age and gender

# Cholesterol vs. age colored by gender; point size also encodes cholesterol
p2 <- hci %>%
  ggplot(aes(x = age, y = chol, fill = sex, size = chol)) +
  geom_point(alpha = 0.7) +
  xlab("Age") +
  ylab("Cholestoral") +
  scale_fill_npg() +
  guides(fill = guide_legend(title = "Gender")) +
  theme(plot.margin = margin(0.1, .1, .1, .1, "cm"))
ggplotly(p2) %>%
  layout(legend = list(orientation = "h", x = 0, y = 1))

Representation of cholesterol level faceted by fasting blood sugar (fbs)

# Same cholesterol-vs-age scatter, faceted by fasting blood sugar (fbs)
p3 <- hci %>%
  ggplot(aes(x = age, y = chol, fill = sex, size = chol)) +
  geom_point(alpha = 0.7) +
  xlab("Age") +
  ylab("Cholestoral") +
  facet_grid(. ~ fbs) +
  theme(plot.margin = margin(0.1, .1, .1, .1, "cm"))
# ggsave("p3.png", plot = p3, dpi = 300) # Uncomment to save the plot to disk
ggplotly(p3) %>%
  layout(legend = list(orientation = "h", x = 0, y = 1))

Comparison of blood pressure across chest pain type (0–3)

# Resting blood pressure by sex, one panel per chest pain type (cp)
p4 <- hci %>%
  ggplot(aes(x = sex, y = trestbps)) +
  geom_boxplot(fill = "darkorange") +
  xlab("Sex") +
  ylab("BP") +
  facet_grid(~cp)
ggplotly(p4)

Comparison of cholesterol across chest pain type (0–3)

# Cholesterol by sex, one panel per chest pain type (cp)
p5 <- hci %>%
  ggplot(aes(x = sex, y = chol)) +
  geom_boxplot(fill = "#D55E00") +
  xlab("Sex") +
  ylab("Chol") +
  facet_grid(~cp)
ggplotly(p5)

Relation between gender, age, cholesterol and blood pressure

# Scatterplot relating age, cholesterol, blood pressure (point size) and gender
# (color), with a loess trend line fitted per gender
gg <- ggplot(hci, aes(x = age, y = chol, col = sex)) +
  geom_point(aes(size = trestbps), shape = 1, alpha = 0.6) +
  theme_bw() +
  geom_smooth(method = "loess", se = FALSE) +
  theme(plot.margin = margin(0.1, .1, .1, .1, "cm"))
ggplotly(gg) %>%
  layout(legend = list(orientation = "h", x = 0, y = 1))

Detection of heart disease using Machine learning methods

First the data is partitioned into training and test datasets

# Reload the raw CSV so all predictors are numeric again for modeling
# (the character/factor conversions above were only for plotting)
set.seed(100) # Make the train/test partition reproducible
hci <- read_csv("heart.csv")
## Parsed with column specification:
## cols(
##   age = col_integer(),
##   sex = col_integer(),
##   cp = col_integer(),
##   trestbps = col_integer(),
##   chol = col_integer(),
##   fbs = col_integer(),
##   restecg = col_integer(),
##   thalach = col_integer(),
##   exang = col_integer(),
##   oldpeak = col_double(),
##   slope = col_integer(),
##   ca = col_integer(),
##   thal = col_integer(),
##   target = col_integer()
## )
# Step 1: Pick row indices for the training split (80%), stratified on target
trainRowNumbers <- createDataPartition(hci$target, p = 0.8, list = FALSE)

# Step 2: Training set
trainData <- hci[trainRowNumbers, ]

# Step 3: Test set (all rows not sampled for training)
testData <- hci[-trainRowNumbers, ]

# Keep the raw predictor columns aside for later use
x <- trainData[, 1:13]

# Relabel the 1/0 target as "P"/"N" — caret's class-probability machinery
# expects class labels that are valid R names, not bare digits
trainData$target[trainData$target == 1] <- "P"
trainData$target[trainData$target == 0] <- "N"
y <- trainData$target

testData$target[testData$target == 1] <- "P"
testData$target[testData$target == 0] <- "N"

yt <- testData$target
# # See the structure of the new dataset

Normalization of features

# Fit a [0, 1] range-scaling model on the TRAINING data only, then apply it to
# both splits. The original code fit a second preProcess model on the test set,
# which scales test features using the test set's own min/max — a form of data
# leakage that also makes train and test feature scales inconsistent (visible
# in the differing str() outputs below).
preProcess_range_model <- preProcess(trainData, method = 'range')

trainData <- predict(preProcess_range_model, newdata = trainData)
testData <- predict(preProcess_range_model, newdata = testData)

# Re-attach the class labels as factors with levels N/P for caret
trainData$target <- as.factor(y)
testData$target <- as.factor(yt)
#apply(trainData[, 1:13], 2, FUN=function(x){c('min'=min(x), 'max'=max(x))})
# Sanity-check: structure of the preprocessed training set
str(trainData)
## Classes 'tbl_df', 'tbl' and 'data.frame':    243 obs. of  14 variables:
##  $ age     : num  0.25 0.562 0.583 0.562 0.312 ...
##  $ sex     : num  0 1 1 0 1 1 1 1 0 0 ...
##  $ cp      : num  0.333 0.333 0 0.333 0.333 ...
##  $ trestbps: num  0.34 0.245 0.434 0.434 0.245 ...
##  $ chol    : num  0.178 0.251 0.151 0.384 0.313 ...
##  $ fbs     : num  0 0 0 0 0 1 0 0 1 0 ...
##  $ restecg : num  0 0.5 0.5 0 0.5 0.5 0.5 0 0 0.5 ...
##  $ thalach : num  0.771 0.817 0.588 0.626 0.779 ...
##  $ exang   : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ oldpeak : num  0.2258 0.129 0.0645 0.2097 0 ...
##  $ slope   : num  1 1 0.5 0.5 1 1 1 0.5 1 0.5 ...
##  $ ca      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ thal    : num  0.667 0.667 0.333 0.667 1 ...
##  $ target  : Factor w/ 2 levels "N","P": 2 2 2 2 2 2 2 2 2 2 ...
str(testData)
## Classes 'tbl_df', 'tbl' and 'data.frame':    60 obs. of  14 variables:
##  $ age     : num  0.8235 0.0588 0.6471 0.6471 0.5588 ...
##  $ sex     : num  1 1 0 1 1 0 1 1 0 1 ...
##  $ cp      : num  1 0.667 0 0.667 0 ...
##  $ trestbps: num  0.593 0.419 0.302 0.651 0.535 ...
##  $ chol    : num  0.26693 0.33466 0.749 0.00797 0.29084 ...
##  $ fbs     : num  1 0 0 0 0 0 0 0 1 1 ...
##  $ restecg : num  0 1 1 1 1 1 1 1 0 0 ...
##  $ thalach : num  0.619 1 0.753 0.866 0.722 ...
##  $ exang   : num  0 0 1 0 0 0 0 1 0 0 ...
##  $ oldpeak : num  0.411 0.625 0.107 0.286 0.214 ...
##  $ slope   : num  0 0 1 1 1 1 1 1 1 0 ...
##  $ ca      : num  0 0 0 0 0 0 0 0 0.25 0 ...
##  $ thal    : num  0 0.5 0.5 0.5 0.5 0.5 0.5 1 0.5 0.5 ...
##  $ target  : Factor w/ 2 levels "N","P": 2 2 2 2 2 2 2 2 2 2 ...

Detection of heart disease using the MARS ("earth") method via the caret package

# Cross-validation settings shared by all models trained below
fitControl <- trainControl(
  method = 'cv',                   # k-fold cross validation
  number = 5,                      # number of folds
  savePredictions = 'final',       # saves predictions for optimal tuning parameter
  classProbs = TRUE,               # return class probabilities (TRUE, not the reassignable shorthand T)
  summaryFunction = twoClassSummary # ROC/Sens/Spec summary for two-class problems
)

# Step 1: Tune hyperparameters — tuneLength = 5 lets caret try 5 candidate
# values of the MARS pruning parameter, selecting by cross-validated ROC
set.seed(100)
model_mars2 <- train(
  target ~ .,
  data = trainData,
  method = 'earth',
  tuneLength = 5,
  metric = 'ROC',
  trControl = fitControl
)
## Loading required package: earth
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
## 
## Attaching package: 'TeachingDemos'
## The following object is masked from 'package:plotly':
## 
##     subplot
model_mars2
## Multivariate Adaptive Regression Spline 
## 
## 243 samples
##  13 predictor
##   2 classes: 'N', 'P' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 194, 194, 195, 195, 194 
## Resampling results across tuning parameters:
## 
##   nprune  ROC        Sens       Spec     
##    2      0.7904281  0.7956710  0.7851852
##    5      0.8950297  0.7766234  0.8444444
##    9      0.9031826  0.7952381  0.8666667
##   13      0.9031826  0.7952381  0.8666667
##   17      0.9031826  0.7952381  0.8666667
## 
## Tuning parameter 'degree' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nprune = 9 and degree = 1.
# Step 2: Predict class labels on the held-out test set and compute the
# confusion matrix with the full set of derived statistics
predicted2 <- predict(model_mars2, testData)
confusionMatrix(data = predicted2, reference = testData$target, mode = 'everything')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  N  P
##          N 24  6
##          P  6 24
##                                           
##                Accuracy : 0.8             
##                  95% CI : (0.6767, 0.8922)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 1.592e-06       
##                                           
##                   Kappa : 0.6             
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8             
##             Specificity : 0.8             
##          Pos Pred Value : 0.8             
##          Neg Pred Value : 0.8             
##               Precision : 0.8             
##                  Recall : 0.8             
##                      F1 : 0.8             
##              Prevalence : 0.5             
##          Detection Rate : 0.4             
##    Detection Prevalence : 0.5             
##       Balanced Accuracy : 0.8             
##                                           
##        'Positive' Class : N               
## 

Comparison of some common ML methods using caret's resamples() function

# Train the model using adaboost
model_adaboost <- train(target ~ ., data = trainData, method = 'adaboost', tuneLength = 2, trControl = fitControl)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
model1 <- train(target ~ ., data = trainData, method = 'knn', tuneLength = 2, trControl = fitControl) # KNN model
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
model2 <- train(target ~ ., data = trainData, method = 'svmRadial', tuneLength = 2, trControl = fitControl) # SVM
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
# BUG FIX: the original assigned the rpart fit back into model2, silently
# overwriting the SVM model, so "SVM" and "RanF" in the comparison were the
# same rpart model (note their identical rows in the summary output below).
# Also note method = 'rpart' is a single CART decision tree, not a random
# forest, so it is labeled RPART here.
model3 <- train(target ~ ., data = trainData, method = 'rpart', tuneLength = 2, trControl = fitControl) # CART decision tree
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
# Compare model performances over the shared cross-validation resamples
models_compare <- resamples(list(EARTH = model_mars2, ADABOOST = model_adaboost, KNN = model1, SVM = model2, RPART = model3))

# Summary of the models performances
# (min/quartiles/max of ROC, sensitivity and specificity across the 5 CV folds)
summary(models_compare)
## 
## Call:
## summary.resamples(object = models_compare)
## 
## Models: EARTH, ADABOOST, KNN, SVM, RanF 
## Number of resamples: 5 
## 
## ROC 
##               Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## EARTH    0.8668430 0.8989899 0.9048822 0.9031826 0.9175084 0.9276896    0
## ADABOOST 0.8451178 0.8483245 0.8569024 0.8618246 0.8783069 0.8804714    0
## KNN      0.7989418 0.8552189 0.8821549 0.8865961 0.9478114 0.9488536    0
## SVM      0.6693122 0.7154882 0.7710438 0.7625541 0.8227513 0.8341751    0
## RanF     0.6693122 0.7154882 0.7710438 0.7625541 0.8227513 0.8341751    0
## 
## Sens 
##               Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## EARTH    0.6190476 0.7727273 0.8571429 0.7952381 0.8636364 0.8636364    0
## ADABOOST 0.6818182 0.7142857 0.7272727 0.7316017 0.7619048 0.7727273    0
## KNN      0.6666667 0.7272727 0.8095238 0.7770563 0.8181818 0.8636364    0
## SVM      0.5714286 0.7272727 0.7272727 0.7406926 0.7727273 0.9047619    0
## RanF     0.5714286 0.7272727 0.7272727 0.7406926 0.7727273 0.9047619    0
## 
## Spec 
##               Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## EARTH    0.8148148 0.8518519 0.8518519 0.8666667 0.8518519 0.9629630    0
## ADABOOST 0.7407407 0.7777778 0.7777778 0.7925926 0.8148148 0.8518519    0
## KNN      0.7407407 0.8148148 0.8888889 0.8666667 0.8888889 1.0000000    0
## SVM      0.7037037 0.7407407 0.8148148 0.8296296 0.9259259 0.9629630    0
## RanF     0.7037037 0.7407407 0.8148148 0.8296296 0.9259259 0.9629630    0
# Box-and-whisker comparison of the models, with free axis scales per metric
axis_scales <- list(x = list(relation = "free"), y = list(relation = "free"))
bwplot(models_compare, scales = axis_scales)